library(dplyr) # for data manipulation
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2) # for data visualization
library(stringr) # for string functionality
library(cluster) # for general clustering algorithms
library(factoextra) # for visualizing cluster results
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(yardstick)
## For binary classification, the first factor level is assumed to be the event.
## Set the global option `yardstick.event_first` to `FALSE` to change this.
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:dplyr':
##
## intersect, setdiff, union
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(ggplot2)
library(ggmap)
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
library(dplyr)
library(readr)
##
## Attaching package: 'readr'
## The following object is masked from 'package:yardstick':
##
## spec
ggmap::register_google(key = "AIzaSyDjdKw8npxWHoH5eL85_OrfxTUSyf8Wxcg")
mnist <- dslabs::read_mnist()
set.seed(1)
sample_1 = sample(60000,10000)
features <- mnist$train$images[sample_1,]
# Use k-means model with 10 centers and 10 random starts
mnist_clustering <- kmeans(features, centers = 10, nstart = 10)
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
# Print contents of the model output
str(mnist_clustering)
## List of 9
## $ cluster : int [1:10000] 7 7 6 9 1 10 5 9 5 10 ...
## $ centers : num [1:10, 1:784] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:10] "1" "2" "3" "4" ...
## .. ..$ : NULL
## $ totss : num 3.42e+10
## $ withinss : num [1:10] 1.02e+09 2.25e+09 2.42e+09 2.59e+09 3.70e+09 ...
## $ tot.withinss: num 2.54e+10
## $ betweenss : num 8.78e+09
## $ size : int [1:10] 734 810 759 914 1246 1043 1408 703 938 1445
## $ iter : int 11
## $ ifault : int 2
## - attr(*, "class")= chr "kmeans"
# Extract cluster centers
mnist_centers <- mnist_clustering$centers
# Create mode function
mode_fun <- function(x){ which.max(tabulate(x))
}
mnist_comparison <- data.frame(
cluster = mnist_clustering$cluster,
actual = mnist$train$labels[sample_1]
) %>%
group_by(cluster) %>%
mutate(mode = mode_fun(actual)) %>%
ungroup() %>%
mutate_all(factor, levels = 0:9)
# Create confusion matrix and plot results
yardstick::conf_mat(
mnist_comparison,
truth = actual,
estimate = mode
) %>%
autoplot(type = 'heatmap')
m1 = prcomp(features, scale = F)
newdata = m1$x
actual = mnist$train$labels[sample_1]
ggplot(data.frame(newdata),aes(x=PC1,y=PC2))+geom_text(aes(label=actual,color = factor(actual)),alpha=0.8)
## From the plot above we can identidy that digit 0 are well seperated and digit 1 are not
df = crime
#head(df)
dfselected <- df$offense=="robbery"|df$offense=="aggravated assault"|df$offense=="rape"|df$offense=="murder"
df2 <- df[dfselected,]
geocode("Houston")
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Houston&key=xxx
## # A tibble: 1 x 2
## lon lat
## <dbl> <dbl>
## 1 -95.4 29.8
Houston.map = get_googlemap(center=c(lon = -95.369345,lat = 29.760155),lon=c(-95.39681,-95.34188),lat = c(29.73631,29.78400),zoom = calc_zoom(c(-95.39681,-95.34188),c(29.73631,29.78400)), scale = 2)
## Source : https://maps.googleapis.com/maps/api/staticmap?center=29.760155,-95.369345&zoom=14&size=640x640&scale=2&maptype=terrain&key=xxx
p = ggmap(Houston.map)
p+ geom_point(data = df2, aes(x=lon,y=lat,colour = factor(offense)),size = 1,alpha=1,na.rm = TRUE)
p+ geom_point(data = df2, aes(x=lon,y=lat,colour = factor(offense)),size = 1,alpha=0.8,na.rm = TRUE)+facet_wrap(~offense,nrow = 2)
data = read_csv("question3.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_double(),
## FIPS = col_double(),
## County = col_character(),
## population = col_double(),
## cases = col_double(),
## lat = col_double(),
## lon = col_double()
## )
data$log <- log10(data$cases)
head(data)
## # A tibble: 6 x 8
## X1 FIPS County population cases lat lon log
## <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 10001 Kent County 180786 503 39.1 -75.5 2.70
## 2 2 10003 New Castle County 558753 1352 39.6 -75.6 3.13
## 3 3 10005 Sussex County 234225 1317 38.7 -75.3 3.12
## 4 4 1001 Autauga County 55869 32 32.5 -86.6 1.51
## 5 5 1003 Baldwin County 223234 132 30.7 -87.7 2.12
## 6 6 1005 Barbour County 24686 29 31.9 -85.4 1.46
us <- c(left = -125, bottom = 25.75, right = -67, top = 49)
US.map = get_stamenmap(us, zoom = 5, maptype = "toner-lite")
## Source : http://tile.stamen.com/toner-lite/5/4/10.png
## Source : http://tile.stamen.com/toner-lite/5/5/10.png
## Source : http://tile.stamen.com/toner-lite/5/6/10.png
## Source : http://tile.stamen.com/toner-lite/5/7/10.png
## Source : http://tile.stamen.com/toner-lite/5/8/10.png
## Source : http://tile.stamen.com/toner-lite/5/9/10.png
## Source : http://tile.stamen.com/toner-lite/5/10/10.png
## Source : http://tile.stamen.com/toner-lite/5/4/11.png
## Source : http://tile.stamen.com/toner-lite/5/5/11.png
## Source : http://tile.stamen.com/toner-lite/5/6/11.png
## Source : http://tile.stamen.com/toner-lite/5/7/11.png
## Source : http://tile.stamen.com/toner-lite/5/8/11.png
## Source : http://tile.stamen.com/toner-lite/5/9/11.png
## Source : http://tile.stamen.com/toner-lite/5/10/11.png
## Source : http://tile.stamen.com/toner-lite/5/4/12.png
## Source : http://tile.stamen.com/toner-lite/5/5/12.png
## Source : http://tile.stamen.com/toner-lite/5/6/12.png
## Source : http://tile.stamen.com/toner-lite/5/7/12.png
## Source : http://tile.stamen.com/toner-lite/5/8/12.png
## Source : http://tile.stamen.com/toner-lite/5/9/12.png
## Source : http://tile.stamen.com/toner-lite/5/10/12.png
## Source : http://tile.stamen.com/toner-lite/5/4/13.png
## Source : http://tile.stamen.com/toner-lite/5/5/13.png
## Source : http://tile.stamen.com/toner-lite/5/6/13.png
## Source : http://tile.stamen.com/toner-lite/5/7/13.png
## Source : http://tile.stamen.com/toner-lite/5/8/13.png
## Source : http://tile.stamen.com/toner-lite/5/9/13.png
## Source : http://tile.stamen.com/toner-lite/5/10/13.png
p = ggmap(US.map)
p+ geom_point(data=data,aes(x=lon,y=lat),colour = "#b207f0",size = data$log,alpha=0.3,na.rm = TRUE)